
# On Windows, set the HF_ENDPOINT environment variable to https://hf-mirror.com
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from langchain.tools import BaseTool
from langchain.agents import initialize_agent, AgentType

# Demo script: caption a sample image with the BLIP image-captioning model,
# once with a text prompt (conditional) and once without (unconditional).

# Model identifier passed to `from_pretrained`. To load from a local
# directory instead, use e.g.:
#   hf_model = "../models/blip-image-captioning-large"
hf_model = 'Salesforce/blip-image-captioning-large'

# Initialize the processor and the model.
processor = BlipProcessor.from_pretrained(hf_model, clean_up_tokenization_spaces=False)
model = BlipForConditionalGeneration.from_pretrained(hf_model)

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'

# Download the demo image. A timeout keeps a stalled connection from hanging
# the script forever, and the context manager releases the connection once
# the image has been read.
with requests.get(img_url, stream=True, timeout=30) as response:
    # Fail with a clear HTTP error instead of letting PIL try to parse an
    # error page as an image.
    response.raise_for_status()
    # Ask urllib3 to undo any Content-Encoding (gzip/deflate) so PIL receives
    # the actual image bytes when reading from the raw stream.
    response.raw.decode_content = True
    # `convert` produces a new in-memory image, so `raw_image` no longer
    # depends on the response stream after this line.
    raw_image = Image.open(response.raw).convert('RGB')

# Conditional image captioning: the caption continues the given text prompt.
text = "this is "
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs, max_new_tokens=1024)
print(processor.decode(out[0], skip_special_tokens=True))
# Output: this is a woman and her dog on the beach playing with each other

# Unconditional image captioning: no text prompt, image only.
inputs = processor(raw_image, return_tensors="pt")
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# Output: woman sitting on the beach with her dog and a cell phone
